!git clone https://github.com/ntasfi/PyGame-Learning-Environment
%cd PyGame-Learning-Environment
!pip install -e . # under PyGame-Learning-Environment directory
!pip install pygame==1.9.6
from ple.games.flappybird import FlappyBird
from ple import PLE
import matplotlib.pyplot as plt
import os
import numpy as np
%matplotlib inline
# Use SDL's dummy video driver so pygame runs headlessly (no pop-up window).
os.environ["SDL_VIDEODRIVER"] = "dummy"
game = FlappyBird()
# PLE wraps the game with an RL-style interface (act / game_over / screen access).
env = PLE(game, fps=30, display_screen=False)
env.reset_game()
# game.actions: dict mapping action description ('up') to its key code (119).
print(game.actions)
# env.getActionSet(): list of usable action values, including None (do nothing).
print(env.getActionSet())
{'up': 119}
[119, None]
# a dictionary describe state
'''
player y position.
players velocity.
next pipe distance to player
next pipe top y position
next pipe bottom y position
next next pipe distance to player
next next pipe top y position
next next pipe bottom y position
'''
game.getGameState()
{'player_y': 256,
'player_vel': 0,
'next_pipe_dist_to_player': 309.0,
'next_pipe_top_y': 144,
'next_pipe_bottom_y': 244,
'next_next_pipe_dist_to_player': 453.0,
'next_next_pipe_top_y': 160,
'next_next_pipe_bottom_y': 260}
import math
import copy
from collections import defaultdict
# Exploration probability never decays below this floor.
MIN_EXPLORING_RATE = 0.01
# NOTE(review): combined with the 0.5 cap in update_parameters this makes the
# learning rate a constant 0.5 for every episode — presumably intentional; confirm.
MIN_LEARNING_RATE = 0.5


class Agent:
    """Tabular SARSA agent over a discretized (bucketed) FlappyBird state."""

    def __init__(self,
                 bucket_range_per_feature,
                 num_action,
                 t=0,
                 discount_factor=0.99):
        """
        Args:
            bucket_range_per_feature: dict mapping each state-feature name to
                the bucket width used to discretize it. A larger width shrinks
                the state space (less training time, worse performance); e.g.
                with width 2, values 1 and 0 share a bucket because
                int(1/2) == int(0/2).
            num_action: number of discrete actions available.
            t: initial episode index used to seed the exploring/learning rates.
            discount_factor: gamma in the SARSA target.
        """
        self.update_parameters(t)  # init exploring rate and learning rate
        # Unseen states lazily get an all-zero Q row.
        self.q_table = defaultdict(lambda: np.zeros(num_action))
        self.discount_factor = discount_factor
        self.num_action = num_action
        self.bucket_range_per_feature = bucket_range_per_feature

    def select_action(self, state):
        """Epsilon-greedy action selection; returns an action index."""
        state_idx = self.get_state_idx(state)
        if np.random.rand() < self.exploring_rate:
            # BUGFIX: previously read the module-level global `num_action`;
            # use the per-instance count so the agent is self-contained.
            action = np.random.choice(self.num_action)
        else:
            # Greedy with respect to the current Q estimates.
            action = np.argmax(self.q_table[state_idx])
        return action

    def update_policy(self, state, action, reward, state_prime):
        """On-policy SARSA update: Q(s,a) += lr * (r + gamma*Q(s',a') - Q(s,a)).

        a' is drawn with the same epsilon-greedy policy used for acting.
        (Off-policy Q-learning would instead bootstrap from max_a Q(s', a).)
        """
        state_idx = self.get_state_idx(state)
        state_prime_idx = self.get_state_idx(state_prime)
        next_action = self.select_action(state_prime)
        self.q_table[state_idx][action] += self.learning_rate * (
            reward
            + self.discount_factor * self.q_table[state_prime_idx][next_action]
            - self.q_table[state_idx][action])

    def get_state_idx(self, state):
        """Discretize a raw state dict into a hashable tuple of bucket indices."""
        # Use pipe positions relative to the player instead of absolute ones.
        state = copy.deepcopy(state)
        state['next_next_pipe_bottom_y'] -= state['player_y']
        state['next_next_pipe_top_y'] -= state['player_y']
        state['next_pipe_bottom_y'] -= state['player_y']
        state['next_pipe_top_y'] -= state['player_y']
        # Sort keys so the tuple layout is deterministic (alphabetical order).
        state_key = [k for k, v in sorted(state.items())]
        # Bucket each feature to shrink the state space and speed up training.
        state_idx = []
        for key in state_key:
            state_idx.append(
                int(state[key] / self.bucket_range_per_feature[key]))
        return tuple(state_idx)

    def update_parameters(self, episode):
        """Decay both rates geometrically with the episode, clamped to [min, 0.5]."""
        self.exploring_rate = max(MIN_EXPLORING_RATE,
                                  min(0.5, 0.99 ** (episode / 30)))
        self.learning_rate = max(MIN_LEARNING_RATE,
                                 min(0.5, 0.99 ** (episode / 30)))

    def shutdown_explore(self):
        # Make action selection fully greedy (used for evaluation episodes).
        self.exploring_rate = 0
# Two actions: flap (119) and do nothing (None).
num_action = len(env.getActionSet())
# Bucket width per state feature, consumed by Agent.get_state_idx; larger
# widths mean coarser discretization (smaller state space).
bucket_range_per_feature = {
    'next_next_pipe_bottom_y': 40,
    'next_next_pipe_dist_to_player': 512,
    'next_next_pipe_top_y': 40,
    'next_pipe_bottom_y': 20,
    'next_pipe_dist_to_player': 20,
    'next_pipe_top_y': 20,
    'player_vel': 4,
    'player_y': 16
}
# init agent
agent = Agent(bucket_range_per_feature, num_action)
import moviepy.editor as mpy
def make_anim(images, fps=60, true_image=False):
    """Build a moviepy VideoClip from a list of frames.

    Args:
        images: sequence of RGB frames.
        fps: playback frame rate; total duration is len(images) / fps.
        true_image: if True, frames are already uint8 RGB and are passed
            through; otherwise they are assumed to lie in [-1, 1] and are
            rescaled to [0, 255].
    Returns:
        An mpy.VideoClip rendering the frames at `fps`.
    """
    duration = len(images) / fps

    def make_frame(t):
        # Map clip time t back to a frame index; clamp to the last frame if
        # rounding pushes the index past the end.
        try:
            x = images[int(len(images) / duration * t)]
        except IndexError:  # BUGFIX: was a bare `except:` swallowing everything
            x = images[-1]
        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    return clip
from IPython.display import Image, display
# Per-evaluation logs, appended once every `print_every_episode` episodes.
reward_per_epoch = []
lifetime_per_epoch = []
exploring_rates = []
learning_rates = []
print_every_episode = 500
show_gif_every_episode = 5000
NUM_EPISODE = 40000
for episode in range(0, NUM_EPISODE):
    # Reset the environment
    env.reset_game()
    # record frame
    frames = [env.getScreenRGB()]
    # Every 500 episodes, shut down exploration to evaluate the greedy policy.
    # NOTE(review): exploration stays off for this whole episode (restored by
    # update_parameters below) and the agent still learns during it.
    if episode % print_every_episode == 0:
        agent.shutdown_explore()
    # the initial state
    state = game.getGameState()
    # cumulate reward for this episode
    cum_reward = 0
    t = 0
    while not env.game_over():
        # select an action
        action = agent.select_action(state)
        # execute the action and get reward
        # reward = +1 when pass a pipe, -5 when die
        reward = env.act(env.getActionSet()[action])
        frames.append(env.getScreenRGB())
        # cumulate reward
        cum_reward += reward
        # observe the result
        state_prime = game.getGameState()  # get next state
        # update agent
        agent.update_policy(state, action, reward, state_prime)
        # Setting up for the next iteration
        state = state_prime
        t += 1
    # update exploring_rate and learning_rate
    agent.update_parameters(episode)
    if episode % print_every_episode == 0:
        print("Episode {} finished after {} time steps, cumulated reward: {}, exploring rate: {}, learning rate: {}".format(
            episode,
            t,
            cum_reward,
            agent.exploring_rate,
            agent.learning_rate
        ))
        reward_per_epoch.append(cum_reward)
        exploring_rates.append(agent.exploring_rate)
        learning_rates.append(agent.learning_rate)
        lifetime_per_epoch.append(t)
    # for every 5000 episode, record an animation
    if episode % show_gif_every_episode == 0:
        print("len frames:", len(frames))
        # Frames come out rotated; rotate back before displaying.
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1))
Episode 0 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 1000 finished after 61 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 1500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 2000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 2500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.43277903725889943, learning rate: 0.5 Episode 3000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.3660323412732292, learning rate: 0.5 Episode 3500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.30957986252419073, learning rate: 0.5 Episode 4000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.26183394327157605, learning rate: 0.5 Episode 4500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.22145178723886091, learning rate: 0.5 Episode 5000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.18729769509073985, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 5500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.15841112426184903, learning rate: 0.5 Episode 6000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.13397967485796172, learning rate: 0.5 Episode 6500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.11331624189077398, learning rate: 0.5 Episode 7000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.09583969128049684, learning rate: 0.5 Episode 7500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.08105851616218128, learning rate: 0.5 Episode 8000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.0685570138491429, learning rate: 0.5 Episode 8500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.05798359469728905, learning rate: 0.5 Episode 9000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.04904089407128572, learning rate: 0.5 Episode 9500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.04147740932356356, learning rate: 0.5 Episode 10000 finished after 64 time steps, cumulated reward: -5.0, exploring rate: 0.03508042658630376, learning rate: 0.5 len frames: 65 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 10500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.029670038450977102, learning rate: 0.5 Episode 11000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.02509408428990297, learning rate: 0.5 Episode 11500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.021223870922486707, learning rate: 0.5 Episode 12000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.017950553275045137, learning rate: 0.5 Episode 12500 finished after 61 time steps, cumulated reward: -5.0, exploring rate: 0.015182073244652034, learning rate: 0.5 Episode 13000 finished after 288 time steps, cumulated reward: 1.0, exploring rate: 0.012840570676248398, learning rate: 0.5 Episode 13500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.010860193639877882, learning rate: 0.5 Episode 14000 finished after 107 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 14500 finished after 111 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 15000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 len frames: 99 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 15500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 16000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 16500 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 17000 finished after 68 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 17500 finished after 172 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 18000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 18500 finished after 324 time steps, cumulated reward: 2.0, exploring rate: 0.01, learning rate: 0.5 Episode 19000 finished after 332 time steps, cumulated reward: 3.0, exploring rate: 0.01, learning rate: 0.5 Episode 19500 finished after 586 time steps, cumulated reward: 9.0, exploring rate: 0.01, learning rate: 0.5 Episode 20000 finished after 243 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 len frames: 244 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 20500 finished after 94 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 21000 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 21500 finished after 46 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 22000 finished after 324 time steps, cumulated reward: 2.0, exploring rate: 0.01, learning rate: 0.5 Episode 22500 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 23000 finished after 67 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 23500 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 24000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 24500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 25000 finished after 514 time steps, cumulated reward: 7.0, exploring rate: 0.01, learning rate: 0.5 len frames: 515 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 25500 finished after 471 time steps, cumulated reward: 6.0, exploring rate: 0.01, learning rate: 0.5 Episode 26000 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 26500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 27000 finished after 211 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5 Episode 27500 finished after 706 time steps, cumulated reward: 13.0, exploring rate: 0.01, learning rate: 0.5 Episode 28000 finished after 473 time steps, cumulated reward: 6.0, exploring rate: 0.01, learning rate: 0.5 Episode 28500 finished after 1490 time steps, cumulated reward: 33.0, exploring rate: 0.01, learning rate: 0.5 Episode 29000 finished after 60 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 Episode 29500 finished after 260 time steps, cumulated reward: 1.0, exploring rate: 0.01, learning rate: 0.5 Episode 30000 finished after 1607 time steps, cumulated reward: 37.0, exploring rate: 0.01, learning rate: 0.5 len frames: 1608 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 30500 finished after 1002 time steps, cumulated reward: 20.0, exploring rate: 0.01, learning rate: 0.5 Episode 31000 finished after 157 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 31500 finished after 103 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 32000 finished after 1829 time steps, cumulated reward: 42.0, exploring rate: 0.01, learning rate: 0.5 Episode 32500 finished after 636 time steps, cumulated reward: 11.0, exploring rate: 0.01, learning rate: 0.5 Episode 33000 finished after 2590 time steps, cumulated reward: 63.0, exploring rate: 0.01, learning rate: 0.5 Episode 33500 finished after 449 time steps, cumulated reward: 6.0, exploring rate: 0.01, learning rate: 0.5 Episode 34000 finished after 3646 time steps, cumulated reward: 91.0, exploring rate: 0.01, learning rate: 0.5 Episode 34500 finished after 853 time steps, cumulated reward: 16.0, exploring rate: 0.01, learning rate: 0.5 Episode 35000 finished after 3524 time steps, cumulated reward: 87.0, exploring rate: 0.01, learning rate: 0.5 len frames: 3525 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 35500 finished after 218 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 36000 finished after 4505 time steps, cumulated reward: 113.0, exploring rate: 0.01, learning rate: 0.5 Episode 36500 finished after 3085 time steps, cumulated reward: 76.0, exploring rate: 0.01, learning rate: 0.5 Episode 37000 finished after 499 time steps, cumulated reward: 7.0, exploring rate: 0.01, learning rate: 0.5 Episode 37500 finished after 5455 time steps, cumulated reward: 139.0, exploring rate: 0.01, learning rate: 0.5 Episode 38000 finished after 211 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5 Episode 38500 finished after 338 time steps, cumulated reward: 3.0, exploring rate: 0.01, learning rate: 0.5 Episode 39000 finished after 3036 time steps, cumulated reward: 74.0, exploring rate: 0.01, learning rate: 0.5 Episode 39500 finished after 3072 time steps, cumulated reward: 75.0, exploring rate: 0.01, learning rate: 0.5
def demo():
    """Run one greedy (no-exploration) episode and display it as a video."""
    # Reset the environment
    env.reset_game()
    # Record every frame so the episode can be replayed as an animation.
    frames = [env.getScreenRGB()]
    # shutdown exploration to see performance of greedy action
    agent.shutdown_explore()
    # the initial state
    state = game.getGameState()
    while not env.game_over():
        # select an action
        action = agent.select_action(state)
        # Execute the action; the reward is irrelevant here (no learning),
        # so the return value of env.act is deliberately discarded.
        env.act(env.getActionSet()[action])
        frames.append(env.getScreenRGB())
        # observe the result and move to the next iteration
        state = game.getGameState()
    # Frames come out rotated; rotate back before displaying.
    clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
    display(clip.ipython_display(fps=60, autoplay=1, loop=1))


demo()
Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
def plot(log, title, y_label):
    """Draw a line chart of per-evaluation values recorded during training."""
    fig, ax = plt.subplots(figsize=(20, 5))
    ax.plot(range(len(log)), log)
    ax.set_title(title)
    ax.set_xlabel('episode')
    ax.set_ylabel(y_label)
    fig.tight_layout()
    plt.show()
# Rate schedules and performance curves for the first run.
# NOTE(review): the label says Q-Learning, but the Agent class as shown uses
# the SARSA update — this run presumably used the commented-out Q-learning
# rule; confirm which update was active.
plot(exploring_rates, 'Exploring Rate', 'exploring rate')
plot(learning_rates, 'Learning Rate', 'learning rate')
method = 'Q-Learning'
plot(lifetime_per_epoch, f'{method} Lifetime per Epoch', 'lifetime per epoch')
plot(reward_per_epoch, f'{method} Reward per Epoch', 'reward per epoch')
# Re-init a fresh agent (empty Q-table) for the second experiment.
agent = Agent(bucket_range_per_feature, num_action)
from IPython.display import Image, display
# Second run (SARSA-labelled): same loop as the first run, with freshly
# reset logs and a freshly initialized agent.
reward_per_epoch = []
lifetime_per_epoch = []
exploring_rates = []
learning_rates = []
print_every_episode = 500
show_gif_every_episode = 5000
NUM_EPISODE = 40000
for episode in range(0, NUM_EPISODE):
    # Reset the environment
    env.reset_game()
    # record frame
    frames = [env.getScreenRGB()]
    # Every 500 episodes, shut down exploration to evaluate the greedy policy
    # (restored by update_parameters below; the agent still learns meanwhile).
    if episode % print_every_episode == 0:
        agent.shutdown_explore()
    # the initial state
    state = game.getGameState()
    # cumulate reward for this episode
    cum_reward = 0
    t = 0
    while not env.game_over():
        # select an action
        action = agent.select_action(state)
        # execute the action and get reward
        # reward = +1 when pass a pipe, -5 when die
        reward = env.act(env.getActionSet()[action])
        frames.append(env.getScreenRGB())
        # cumulate reward
        cum_reward += reward
        # observe the result
        state_prime = game.getGameState()  # get next state
        # update agent
        agent.update_policy(state, action, reward, state_prime)
        # Setting up for the next iteration
        state = state_prime
        t += 1
    # update exploring_rate and learning_rate
    agent.update_parameters(episode)
    if episode % print_every_episode == 0:
        print("Episode {} finished after {} time steps, cumulated reward: {}, exploring rate: {}, learning rate: {}".format(
            episode,
            t,
            cum_reward,
            agent.exploring_rate,
            agent.learning_rate
        ))
        reward_per_epoch.append(cum_reward)
        exploring_rates.append(agent.exploring_rate)
        learning_rates.append(agent.learning_rate)
        lifetime_per_epoch.append(t)
    # for every 5000 episode, record an animation
    if episode % show_gif_every_episode == 0:
        print("len frames:", len(frames))
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1))
Episode 0 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 1000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 1500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 2000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.5, learning rate: 0.5 Episode 2500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.43277903725889943, learning rate: 0.5 Episode 3000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.3660323412732292, learning rate: 0.5 Episode 3500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.30957986252419073, learning rate: 0.5 Episode 4000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.26183394327157605, learning rate: 0.5 Episode 4500 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.22145178723886091, learning rate: 0.5 Episode 5000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.18729769509073985, learning rate: 0.5 len frames: 63 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 5500 finished after 67 time steps, cumulated reward: -4.0, exploring rate: 0.15841112426184903, learning rate: 0.5 Episode 6000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.13397967485796172, learning rate: 0.5 Episode 6500 finished after 44 time steps, cumulated reward: -5.0, exploring rate: 0.11331624189077398, learning rate: 0.5 Episode 7000 finished after 62 time steps, cumulated reward: -5.0, exploring rate: 0.09583969128049684, learning rate: 0.5 Episode 7500 finished after 76 time steps, cumulated reward: -4.0, exploring rate: 0.08105851616218128, learning rate: 0.5 Episode 8000 finished after 50 time steps, cumulated reward: -5.0, exploring rate: 0.0685570138491429, learning rate: 0.5 Episode 8500 finished after 76 time steps, cumulated reward: -4.0, exploring rate: 0.05798359469728905, learning rate: 0.5 Episode 9000 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.04904089407128572, learning rate: 0.5 Episode 9500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.04147740932356356, learning rate: 0.5 Episode 10000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.03508042658630376, learning rate: 0.5 len frames: 99 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 10500 finished after 70 time steps, cumulated reward: -4.0, exploring rate: 0.029670038450977102, learning rate: 0.5 Episode 11000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.02509408428990297, learning rate: 0.5 Episode 11500 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.021223870922486707, learning rate: 0.5 Episode 12000 finished after 72 time steps, cumulated reward: -4.0, exploring rate: 0.017950553275045137, learning rate: 0.5 Episode 12500 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.015182073244652034, learning rate: 0.5 Episode 13000 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.012840570676248398, learning rate: 0.5 Episode 13500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.010860193639877882, learning rate: 0.5 Episode 14000 finished after 72 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 14500 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 15000 finished after 360 time steps, cumulated reward: 3.0, exploring rate: 0.01, learning rate: 0.5 len frames: 361 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 15500 finished after 324 time steps, cumulated reward: 2.0, exploring rate: 0.01, learning rate: 0.5 Episode 16000 finished after 107 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 16500 finished after 150 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 17000 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 17500 finished after 211 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5 Episode 18000 finished after 284 time steps, cumulated reward: 1.0, exploring rate: 0.01, learning rate: 0.5 Episode 18500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 19000 finished after 889 time steps, cumulated reward: 17.0, exploring rate: 0.01, learning rate: 0.5 Episode 19500 finished after 178 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 20000 finished after 224 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 len frames: 225 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 20500 finished after 139 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 21000 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 21500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 22000 finished after 211 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5 Episode 22500 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 Episode 23000 finished after 324 time steps, cumulated reward: 2.0, exploring rate: 0.01, learning rate: 0.5 Episode 23500 finished after 247 time steps, cumulated reward: 0.0, exploring rate: 0.01, learning rate: 0.5 Episode 24000 finished after 586 time steps, cumulated reward: 9.0, exploring rate: 0.01, learning rate: 0.5 Episode 24500 finished after 175 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 25000 finished after 134 time steps, cumulated reward: -3.0, exploring rate: 0.01, learning rate: 0.5 len frames: 135 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 25500 finished after 663 time steps, cumulated reward: 11.0, exploring rate: 0.01, learning rate: 0.5 Episode 26000 finished after 663 time steps, cumulated reward: 11.0, exploring rate: 0.01, learning rate: 0.5 Episode 26500 finished after 663 time steps, cumulated reward: 11.0, exploring rate: 0.01, learning rate: 0.5 Episode 27000 finished after 211 time steps, cumulated reward: -1.0, exploring rate: 0.01, learning rate: 0.5 Episode 27500 finished after 642 time steps, cumulated reward: 11.0, exploring rate: 0.01, learning rate: 0.5 Episode 28000 finished after 1320 time steps, cumulated reward: 29.0, exploring rate: 0.01, learning rate: 0.5 Episode 28500 finished after 1228 time steps, cumulated reward: 26.0, exploring rate: 0.01, learning rate: 0.5 Episode 29000 finished after 514 time steps, cumulated reward: 7.0, exploring rate: 0.01, learning rate: 0.5 Episode 29500 finished after 360 time steps, cumulated reward: 3.0, exploring rate: 0.01, learning rate: 0.5 Episode 30000 finished after 59 time steps, cumulated reward: -5.0, exploring rate: 0.01, learning rate: 0.5 len frames: 60 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 30500 finished after 328 time steps, cumulated reward: 3.0, exploring rate: 0.01, learning rate: 0.5 Episode 31000 finished after 1534 time steps, cumulated reward: 34.0, exploring rate: 0.01, learning rate: 0.5 Episode 31500 finished after 163 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 32000 finished after 324 time steps, cumulated reward: 2.0, exploring rate: 0.01, learning rate: 0.5 Episode 32500 finished after 149 time steps, cumulated reward: -2.0, exploring rate: 0.01, learning rate: 0.5 Episode 33000 finished after 3262 time steps, cumulated reward: 80.0, exploring rate: 0.01, learning rate: 0.5 Episode 33500 finished after 98 time steps, cumulated reward: -4.0, exploring rate: 0.01, learning rate: 0.5 Episode 34000 finished after 889 time steps, cumulated reward: 17.0, exploring rate: 0.01, learning rate: 0.5 Episode 34500 finished after 437 time steps, cumulated reward: 5.0, exploring rate: 0.01, learning rate: 0.5 Episode 35000 finished after 1809 time steps, cumulated reward: 42.0, exploring rate: 0.01, learning rate: 0.5 len frames: 1810 Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
Episode 35500 finished after 288 time steps, cumulated reward: 1.0, exploring rate: 0.01, learning rate: 0.5 Episode 36000 finished after 327 time steps, cumulated reward: 2.0, exploring rate: 0.01, learning rate: 0.5 Episode 36500 finished after 473 time steps, cumulated reward: 6.0, exploring rate: 0.01, learning rate: 0.5 Episode 37000 finished after 3679 time steps, cumulated reward: 91.0, exploring rate: 0.01, learning rate: 0.5 Episode 37500 finished after 368 time steps, cumulated reward: 4.0, exploring rate: 0.01, learning rate: 0.5 Episode 38000 finished after 3235 time steps, cumulated reward: 80.0, exploring rate: 0.01, learning rate: 0.5 Episode 38500 finished after 2810 time steps, cumulated reward: 68.0, exploring rate: 0.01, learning rate: 0.5 Episode 39000 finished after 926 time steps, cumulated reward: 18.0, exploring rate: 0.01, learning rate: 0.5 Episode 39500 finished after 553 time steps, cumulated reward: 8.0, exploring rate: 0.01, learning rate: 0.5
# Replay the trained agent greedily and show the animation.
demo()
Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
Moviepy - Done ! Moviepy - video ready __temp__.mp4
# Rate schedules and performance curves for the SARSA run.
plot(exploring_rates, 'Exploring Rate', 'exploring rate')
plot(learning_rates, 'Learning Rate', 'learning rate')
method = 'SARSA'
plot(lifetime_per_epoch, f'{method} Lifetime per Epoch', 'lifetime per epoch')
plot(reward_per_epoch, f'{method} Reward per Epoch', 'reward per epoch')
from IPython.display import Video
# Pre-recorded demo video saved under ./output.
print('Q-Learning Demo')
Video('./output/Demo Q-learning.mp4')
Q-Learning Demo
# Pre-recorded demo video saved under ./output.
print('SARSA Demo')
Video('./output/Demo SARSA.mp4')
SARSA Demo
可以從兩個方法的Demo影片中看出,兩個Agent做出的動作選擇並沒有相差太多,因為環境的設定是通過一個水管就有獎勵,但並沒有其他的懲罰,若是加上懲罰(如: 飛一下-1),那Q-Learning訓練出來的agent應該會有更多鋌而走險的走法(快碰到水管邊緣)。
*影片無法播放的話,上面有兩個方法各自的Demo影片,我也有在雲端中放上影片及下面的圖表。
import matplotlib.pyplot as plt
import matplotlib.image as img

# Show the saved lifetime curves of the two methods stacked for comparison.
img1 = img.imread('./output/Q-learning_lifetime.png')
img2 = img.imread('./output/SARSA_lifetime.png')
fig, axes = plt.subplots(2, figsize=(18, 10))
for ax, image in zip(axes, [img1, img2]):
    # BUGFIX: the image was accidentally drawn twice on each axis.
    ax.imshow(image)
    ax.axis('off')
plt.tight_layout()
plt.show()
# Show the saved reward curves of the two methods stacked for comparison.
img1 = img.imread('./output/Q-learning_reward.png')
img2 = img.imread('./output/SARSA_reward.png')
fig, axes = plt.subplots(2, figsize=(18, 10))
for ax, image in zip(axes, [img1, img2]):
    # BUGFIX: the image was accidentally drawn twice on each axis.
    ax.imshow(image)
    ax.axis('off')
plt.tight_layout()
plt.show()
不管是在Lifetime還是Reward的結果,兩個方法雖然都有明顯高低震盪,但Q-learning平均都比SARSA來的好,可能是在學習過程中,Q-learning傾向會去找下一步最好的動作去學習,讓整體的表現都優於SARSA。